{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql import SparkSession\n",
    "spark=SparkSession.builder.appName('lin_reg').getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.regression import LinearRegression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "df=spark.read.csv('linear_reg.csv',inferSchema=True,header=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(1150, 6)\n"
     ]
    }
   ],
   "source": [
    "print((df.count(), len(df.columns)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- var1: integer (nullable = true)\n",
      " |-- var2: integer (nullable = true)\n",
      " |-- var3: integer (nullable = true)\n",
      " |-- var4: double (nullable = true)\n",
      " |-- var5: double (nullable = true)\n",
      " |-- result: double (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-----------------+-----------------+------------------+--------------------+--------------------+--------------------+\n",
      "|summary|             var1|             var2|              var3|                var4|                var5|              result|\n",
      "+-------+-----------------+-----------------+------------------+--------------------+--------------------+--------------------+\n",
      "|  count|             1150|             1150|              1150|                1150|                1150|                1150|\n",
      "|   mean|643.6973913043478|644.2704347826087| 71.62608695652175|  0.3191382608695646| 0.15598347826086953|  0.3919947826086955|\n",
      "| stddev|63.10662570038112|64.78680908019999|13.753493658264567|0.030429290211964388|0.010494753386318759|0.034980138199992294|\n",
      "|    min|              460|              461|                40|               0.161|                0.11|               0.301|\n",
      "|    max|             1009|             1103|               116|               0.369|               0.194|               0.491|\n",
      "+-------+-----------------+-----------------+------------------+--------------------+--------------------+--------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Row(var1=634, var2=666, var3=61, var4=0.316, var5=0.159, result=0.416),\n",
       " Row(var1=600, var2=600, var3=94, var4=0.31, var5=0.146, result=0.369),\n",
       " Row(var1=611, var2=605, var3=93, var4=0.311, var5=0.146, result=0.416)]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.sql.functions import corr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------+\n",
      "|corr(var1, result)|\n",
      "+------------------+\n",
      "| 0.512530311949889|\n",
      "+------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df.select(corr('var1','result')).show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.linalg import Vector\n",
    "from pyspark.ml.feature import VectorAssembler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['var1', 'var2', 'var3', 'var4', 'var5', 'result']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "vec_assmebler=VectorAssembler(inputCols=['var1', 'var2', 'var3', 'var4', 'var5'],outputCol='features')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "features_df=vec_assmebler.transform(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "root\n",
      " |-- var1: integer (nullable = true)\n",
      " |-- var2: integer (nullable = true)\n",
      " |-- var3: integer (nullable = true)\n",
      " |-- var4: double (nullable = true)\n",
      " |-- var5: double (nullable = true)\n",
      " |-- result: double (nullable = true)\n",
      " |-- features: vector (nullable = true)\n",
      "\n"
     ]
    }
   ],
   "source": [
    "features_df.printSchema()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------+\n",
      "|features                      |\n",
      "+------------------------------+\n",
      "|[634.0,666.0,61.0,0.316,0.159]|\n",
      "|[600.0,600.0,94.0,0.31,0.146] |\n",
      "+------------------------------+\n",
      "only showing top 2 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "features_df.select('features').show(2,truncate=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "model_df=features_df.select('features','result')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+------------------------------+------+\n",
      "|features                      |result|\n",
      "+------------------------------+------+\n",
      "|[634.0,666.0,61.0,0.316,0.159]|0.416 |\n",
      "|[600.0,600.0,94.0,0.31,0.146] |0.369 |\n",
      "|[611.0,605.0,93.0,0.311,0.146]|0.416 |\n",
      "|[634.0,606.0,69.0,0.315,0.16] |0.415 |\n",
      "|[613.0,659.0,61.0,0.301,0.14] |0.366 |\n",
      "+------------------------------+------+\n",
      "only showing top 5 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "model_df.show(5,False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_df,test_df=model_df.randomSplit([0.7,0.3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+-------+-------------------+\n",
      "|summary|             result|\n",
      "+-------+-------------------+\n",
      "|  count|                806|\n",
      "|   mean| 0.3923002481389583|\n",
      "| stddev|0.03448916185778715|\n",
      "|    min|               0.31|\n",
      "|    max|              0.469|\n",
      "+-------+-------------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "train_df.describe().show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "lin_reg=LinearRegression(featuresCol='features',labelCol='result')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr_model=lin_reg.fit(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "方程截距:-0.0448741221007\n",
      "方程参数系数:[6.201299484897484e-05,0.00010179604468040109,0.0004519575988047111,0.1427404941053289,1.6236235191044002]\n"
     ]
    }
   ],
   "source": [
    "print('{}{}'.format('方程截距:',lr_model.intercept)) \n",
    "print('{}{}'.format('方程参数系数:',lr_model.coefficients))  # 回归方程中的中自变量的系数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[6.201299484897484e-05,0.00010179604468040109,0.0004519575988047111,0.1427404941053289,1.6236235191044002]\n"
     ]
    }
   ],
   "source": [
    "print(lr_model.coefficients)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_predictions=lr_model.evaluate(train_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "均方误差:0.000493787041997\n",
      "R2判定系数：0.584363604835\n"
     ]
    }
   ],
   "source": [
    "training_predictions.meanSquaredError\n",
    "print('{}{}'.format('均方误差:',training_predictions.meanSquaredError))            # 误差值差值平方   \n",
    "print('{}{}'.format('R2判定系数：',training_predictions.r2 ))  # r2 判定系数,用来判定，构建的模型是否能够准确的预测,越大说明预测的准确率越高"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5843636048352541"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_predictions.r2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_results=lr_model.evaluate(test_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+\n",
      "|           residuals|\n",
      "+--------------------+\n",
      "| 0.04057100269836972|\n",
      "|0.020256134733585784|\n",
      "| 0.01987874430544695|\n",
      "|-0.01024864302487...|\n",
      "|0.003389702064237...|\n",
      "|0.004924344039997364|\n",
      "|0.024724843364441307|\n",
      "| 0.04313982591419813|\n",
      "|0.012915417413048857|\n",
      "|-0.00913028485932...|\n",
      "+--------------------+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "test_results.residuals.show(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.5495490308189562"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_results.r2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.024223963230906198"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_results.rootMeanSquaredError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0005868003946122955"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_results.meanSquaredError"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
